# Load the Fortune 500 / website-metrics dataset (semicolon separated,
# "n/a" marks a missing value).
# stringsAsFactors = TRUE is spelled out because the numeric conversion below
# depends on it: text columns such as company/url must be factors so that
# as.numeric() returns integer codes. With the R >= 4.0 default (FALSE) they
# would turn into all-NA columns and na.omit() would drop every row.
total_500 <- read.csv(
  "~/GitHub/thesis_msc_business_analytics/Python/total_500_new.csv",
  sep = ";", na.strings = "n/a", stringsAsFactors = TRUE
)
# Number of observations and variables
dim(total_500)
## [1] 500 730
# Work on a copy so the raw import stays untouched
total_500_sub <- total_500
# Four money columns use a decimal comma; replace it with a decimal point
decimal_comma_cols <- c(
  "Assets..", "Market.value..", "Revenues..", "Total.Stockholder.Equity.."
)
for (col in decimal_comma_cols) {
  total_500_sub[[col]] <- gsub(",", ".", total_500_sub[[col]])
}
# Make every column numeric except column 19, which is deliberately skipped
# (presumably the text Readability column that is recoded later -- confirm).
# The column range follows the data instead of a hard-coded 730.
numeric_cols <- setdiff(seq_len(ncol(total_500_sub)), 19)
for (i in numeric_cols) {
  total_500_sub[, i] <- as.numeric(total_500_sub[, i])
}
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Drop every row that contains at least one NA
total_500_final <- na.omit(total_500_sub)
# Rename X to Ranking and give the Fortune variables readable names
colnames(total_500_final)[c(1, 2, 3, 4, 6)] <- c(
  "Ranking", "Assets", "Market_Value", "Revenues", "Total_SH_Equity"
)
# Delete the variables that are not needed
total_500_final$Revenues...1 <- NULL # Revenues %
total_500_final$company <- NULL # company name
total_500_final$url <- NULL # company url
#we upload the libraries beneath that we will use in the analysis
library(ggplot2)
library(reshape2)
library(DAAG)
## Loading required package: lattice
# Final number of observations and variables used in the analysis
dim(total_500_final)
## [1] 408 727
#######################################################################################################
# Histograms of the four Fortune variables, to get a feel for how each one
# is distributed
ggplot(total_500_final, aes(x = Revenues)) +
  geom_histogram(binwidth = 50, colour = "green", fill = "darkgreen")

ggplot(total_500_final, aes(x = Assets)) +
  geom_histogram(binwidth = 100, colour = "red", fill = "darkred")

ggplot(total_500_final, aes(x = Market_Value)) +
  geom_histogram(binwidth = 100, colour = "blue", fill = "darkblue")

ggplot(total_500_final, aes(x = Total_SH_Equity)) +
  geom_histogram(binwidth = 100, colour = "purple", fill = "pink")

###############################################################################################
# Scatter plots of each Fortune 500 variable (x) against the Ranking (y)
ggplot(total_500_final, aes(x = Assets, y = Ranking)) +
  geom_point(colour = "red")

ggplot(total_500_final, aes(x = Market_Value, y = Ranking)) +
  geom_point(colour = "blue")

ggplot(total_500_final, aes(x = Total_SH_Equity, y = Ranking)) +
  geom_point(colour = "purple")

ggplot(total_500_final, aes(x = Revenues, y = Ranking)) +
  geom_point(colour = "green")

# Ranking appears to be related to Revenues, so one of the two is enough
# when relating the Fortune data to the website metrics.
# Correlation matrix of the five Fortune variables (columns 1 to 5)
total_500_fortune <- total_500_final[, 1:5]
library(corrplot)
library(caret)
sm <- cor(total_500_fortune)
sm
## Ranking Assets Market_Value Revenues
## Ranking 1.0000000 -0.36673307 -0.15959008 -0.67511457
## Assets -0.3667331 1.00000000 0.16787320 0.43479882
## Market_Value -0.1595901 0.16787320 1.00000000 0.31085660
## Revenues -0.6751146 0.43479882 0.31085660 1.00000000
## Total_SH_Equity 0.1327272 -0.03638159 -0.02912268 -0.05616772
## Total_SH_Equity
## Ranking 0.13272724
## Assets -0.03638159
## Market_Value -0.02912268
## Revenues -0.05616772
## Total_SH_Equity 1.00000000
# Plot the matrix computed above instead of recomputing it
corrplot(sm, method = "number")

# Ranking and Revenues show the strongest correlation of all pairs
# (about -0.68).
##########################################################################################################
# Social media presence of the sites.
# For each network: the share of sites without (0) / with (1) an account,
# a pie chart of those shares, and the 0/1 flag plotted against Revenues.

# Denominator for the shares: the actual number of sites, not a literal 408,
# so the shares stay right if the cleaned sample changes.
n_sites <- nrow(total_500_final)

# Build the pie labels ("35.3 % no", "64.7 % yes") from the computed shares
# instead of typing the percentages by hand. The level named "1" is labelled
# "yes", every other level "no".
share_labels <- function(shares) {
  answer <- ifelse(names(shares) == "1", "% yes", "% no")
  sprintf("%.1f %s", 100 * as.numeric(shares), answer)
}

# Facebook
social_media_facebook <- round(table(total_500_final$facebook) / n_sites, 3)
social_media_facebook
##
## 0 1
## 0.353 0.647
slicelable <- share_labels(social_media_facebook)
pie(social_media_facebook,
  labels = slicelable, main = "Share of companies with Facebook",
  col = rainbow(length(social_media_facebook))
)

ggplot(total_500_final, aes(Revenues, facebook)) +
  geom_point(size = 3, colour = "darkblue")

# Twitter
social_media_twitter <- round(table(total_500_final$twitter) / n_sites, 3)
social_media_twitter
##
## 0 1
## 0.314 0.686
slicelable <- share_labels(social_media_twitter)
pie(social_media_twitter,
  labels = slicelable, main = "Share of companies with Twitter",
  col = rainbow(length(social_media_twitter))
)

ggplot(total_500_final, aes(Revenues, twitter)) +
  geom_point(size = 3, colour = "darkgreen")

# Instagram
social_media_instagram <- round(table(total_500_final$instagram) / n_sites, 3)
social_media_instagram
##
## 0 1
## 0.777 0.223
slicelable <- share_labels(social_media_instagram)
pie(social_media_instagram,
  labels = slicelable, main = "Share of companies with Instagram",
  col = rainbow(length(social_media_instagram))
)

ggplot(total_500_final, aes(Revenues, instagram)) +
  geom_point(size = 3, colour = "pink")

# Pinterest
social_media_pinterest <- round(table(total_500_final$pinterest) / n_sites, 3)
social_media_pinterest
##
## 0 1
## 0.902 0.098
slicelable <- share_labels(social_media_pinterest)
pie(social_media_pinterest,
  labels = slicelable, main = "Share of companies with Pinterest",
  col = rainbow(length(social_media_pinterest))
)

ggplot(total_500_final, aes(Revenues, pinterest)) +
  geom_point(size = 3, colour = "darkred")

# Youtube
social_media_youtube <- round(table(total_500_final$youtube) / n_sites, 3)
social_media_youtube
##
## 0 1
## 0.417 0.583
slicelable <- share_labels(social_media_youtube)
pie(social_media_youtube,
  labels = slicelable, main = "Share of companies with Youtube",
  col = rainbow(length(social_media_youtube))
)

ggplot(total_500_final, aes(Revenues, youtube)) +
  geom_point(size = 3, colour = "red")

# LinkedIn
social_media_linkedin <- round(table(total_500_final$linkedin) / n_sites, 3)
social_media_linkedin
##
## 0 1
## 0.429 0.571
slicelable <- share_labels(social_media_linkedin)
pie(social_media_linkedin,
  labels = slicelable, main = "Share of companies with Linkedin",
  col = rainbow(length(social_media_linkedin))
)

ggplot(total_500_final, aes(Revenues, linkedin)) +
  geom_point(size = 3, colour = "blue")

# Correlations between Revenues (column 4) and the six social media flags
# (columns 10 to 15)
total_500_social_media <- total_500_final[, c(4, 10:15)]
library(corrplot)
library(caret)
sm <- cor(total_500_social_media)
sm
## Revenues facebook instagram linkedin pinterest
## Revenues 1.000000000 0.01121852 0.05771665 -0.008311532 0.09686843
## facebook 0.011218524 1.00000000 0.35874256 0.520581725 0.24349238
## instagram 0.057716654 0.35874256 1.00000000 0.143134960 0.37774489
## linkedin -0.008311532 0.52058172 0.14313496 1.000000000 -0.03069495
## pinterest 0.096868426 0.24349238 0.37774489 -0.030694951 1.00000000
## twitter 0.002185367 0.67230226 0.32419034 0.577378625 0.20514804
## youtube 0.074833925 0.54096275 0.32145351 0.482997415 0.19504737
## twitter youtube
## Revenues 0.002185367 0.07483393
## facebook 0.672302259 0.54096275
## instagram 0.324190344 0.32145351
## linkedin 0.577378625 0.48299741
## pinterest 0.205148042 0.19504737
## twitter 1.000000000 0.52142857
## youtube 0.521428571 1.00000000
# Plot the matrix computed above instead of recomputing it
corrplot(sm, method = "number")

# Facebook correlates above 0.5 with twitter, youtube and linkedin;
# the smallest correlations are those of pinterest and instagram.
#########################################################################################################
# Links: histogram of each link count, followed by the same count plotted
# against Revenues
par(mfrow = c(1, 1))
library(ggplot2)
ggplot(total_500_final, aes(x = total.links)) +
  geom_histogram(binwidth = 50, colour = "darkblue", fill = "blue")

ggplot(total_500_final, aes(x = Revenues, y = total.links)) +
  geom_point(size = 3, colour = "darkblue")

ggplot(total_500_final, aes(x = external)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(x = Revenues, y = external)) +
  geom_point(size = 3, colour = "darkred")

ggplot(total_500_final, aes(x = internal)) +
  geom_histogram(binwidth = 50, colour = "darkgreen", fill = "green")

ggplot(total_500_final, aes(x = Revenues, y = internal)) +
  geom_point(size = 3, colour = "darkgreen")

# Correlations between Revenues (column 4) and the link counts
# (columns 21 to 23)
total_500_links <- total_500_final[, c(4, 21:23)]
library(corrplot)
library(caret)
tl <- cor(total_500_links)
tl
## Revenues external internal total.links
## Revenues 1.00000000 0.034100506 0.004559950 0.01538199
## external 0.03410051 1.000000000 -0.002593961 0.32202419
## internal 0.00455995 -0.002593961 1.000000000 0.94589294
## total.links 0.01538199 0.322024191 0.945892937 1.00000000
# Plot the matrix computed above instead of recomputing it
corrplot(tl, method = "number")

# total.links and internal links correlate at almost 0.95, so total.links
# will be left out of the regression model.
#########################################################################################################
# Loading time per site: distribution and relationship with Revenues
ggplot(total_500_final, aes(x = loading.time)) +
  geom_histogram(binwidth = 1, colour = "pink", fill = "purple")

ggplot(total_500_final, aes(x = Revenues, y = loading.time)) +
  geom_point(size = 3, colour = "purple")

#########################################################################################################
# Sentences, unique words and total words: distribution of each one and its
# relationship with Revenues
ggplot(total_500_final, aes(x = Sentences)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(x = Revenues, y = Sentences)) +
  geom_point(size = 3, colour = "purple")

#########################
ggplot(total_500_final, aes(x = Unique.words)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(x = Revenues, y = Unique.words)) +
  geom_point(size = 3, colour = "purple")

#########################
ggplot(total_500_final, aes(x = Words)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(x = Revenues, y = Words)) +
  geom_point(size = 3, colour = "purple")

#############################
# Correlations between Revenues (column 4), the text counts (columns 18 to
# 20) and loading.time (column 727)
total_500_lt_w <- total_500_final[, c(4, 18:20, 727)]
library(corrplot)
library(caret)
tl <- cor(total_500_lt_w)
tl
## Revenues Sentences Unique.words Words loading.time
## Revenues 1.00000000 -0.01183819 -0.04362118 -0.03479049 -0.1212650
## Sentences -0.01183819 1.00000000 0.69454327 0.78851979 0.1497520
## Unique.words -0.04362118 0.69454327 1.00000000 0.93243940 0.1994296
## Words -0.03479049 0.78851979 0.93243940 1.00000000 0.1857922
## loading.time -0.12126500 0.14975205 0.19942956 0.18579225 1.0000000
# Plot the matrix computed above instead of recomputing it
corrplot(tl, method = "number")

################################
# Flesch measure (column Flesh_Mesaure): distribution and relationship with
# Revenues
ggplot(data = total_500_final, aes(x = Flesh_Mesaure)) +
  geom_histogram(binwidth = 50, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(Revenues, Flesh_Mesaure)) +
  geom_point(size = 3, colour = "purple")

############################
# Readability: recode the text labels, first into sortable codes for the bar
# plot and then into the numbers 1 to 7.
# The two vectors are parallel and are applied in this exact order, as
# successive substring replacements.
readability_labels <- c(
  "Very easy", "Easy", "Fairly easy", "Standard",
  "Fairly difficult", "Difficult", "Very Confusing"
)
readability_codes <- c(
  "01_VE", "02_E", "03_FE", "04_St", "05_FD", "06_D", "07_VC"
)
for (j in seq_along(readability_labels)) {
  total_500_final$Readability <- gsub(
    readability_labels[j], readability_codes[j], total_500_final$Readability
  )
}
barplot(table(total_500_final$Readability), col = "dark red")

# Replace each sortable code with its position (1 to 7) and make it numeric
for (j in seq_along(readability_codes)) {
  total_500_final$Readability <- gsub(
    readability_codes[j], as.character(j), total_500_final$Readability
  )
}
total_500_final$Readability <- as.numeric(total_500_final$Readability)
# geom_bar() has no binwidth parameter (it only produced a warning), so the
# argument is dropped; the bars are unchanged.
ggplot(data = total_500_final, aes(x = Readability)) +
  geom_bar(colour = "darkred", fill = "red")

ggplot(total_500_final, aes(Revenues, Readability)) +
  geom_point(size = 3, colour = "purple")

# Correlations between Revenues (column 4), Flesh_Mesaure (column 16) and
# the numeric Readability (column 17)
total_500_r <- total_500_final[, c(4, 16, 17)]
library(corrplot)
library(caret)
tl <- cor(total_500_r)
tl
## Revenues Flesh_Mesaure Readability
## Revenues 1.00000000 0.02476229 -0.02694931
## Flesh_Mesaure 0.02476229 1.00000000 -0.17094994
## Readability -0.02694931 -0.17094994 1.00000000
# Plot the matrix computed above instead of recomputing it
corrplot(tl, method = "number")

#########################################################################################################
# Number of errors and warnings: distribution of each one and its
# relationship with Revenues
ggplot(total_500_final, aes(x = number_of_errors)) +
  geom_histogram(binwidth = 50, colour = "red")

ggplot(total_500_final, aes(x = Revenues, y = number_of_errors)) +
  geom_point(size = 3, colour = "dark red")

ggplot(total_500_final, aes(x = number_of_warning)) +
  geom_histogram(binwidth = 20, colour = "red")

ggplot(total_500_final, aes(x = Revenues, y = number_of_warning)) +
  geom_point(size = 3, colour = "dark blue")

#########################################################################################################
#########################################################################################################
# non.document.error and The_page_opened: distribution of each one and its
# relationship with Revenues
ggplot(total_500_final, aes(x = non.document.error)) +
  geom_histogram(binwidth = 1, colour = "red")

ggplot(total_500_final, aes(x = Revenues, y = non.document.error)) +
  geom_point(size = 1, colour = "dark red")

ggplot(total_500_final, aes(x = The_page_opened)) +
  geom_histogram(binwidth = 1, colour = "red")

ggplot(total_500_final, aes(x = Revenues, y = The_page_opened)) +
  geom_point(size = 3, colour = "dark blue")

# The_page_opened only takes the value 1 (the page opened), so it cannot
# affect the outcome and there is no point in using it in the analysis.
#########################################################################################################
# Correlations between Revenues (column 4) and the HTML validation variables
# (columns 7 to 9)
total_500_html <- total_500_final[, c(4, 7:9)]
library(corrplot)
library(caret)
tl <- cor(total_500_html)
tl
## Revenues non.document.error number_of_errors
## Revenues 1.00000000 -0.0748407 0.0800205
## non.document.error -0.07484070 1.0000000 -0.2545301
## number_of_errors 0.08002050 -0.2545301 1.0000000
## number_of_warning 0.09505013 -0.2242315 0.2309578
## number_of_warning
## Revenues 0.09505013
## non.document.error -0.22423152
## number_of_errors 0.23095778
## number_of_warning 1.00000000
# Plot the matrix computed above instead of recomputing it
corrplot(tl, method = "number")

# Total images: distribution and relationship with Revenues
ggplot(total_500_final, aes(x = total.images)) +
  geom_histogram(binwidth = 100, colour = "darkred", fill = "red")

ggplot(total_500_final, aes(x = Revenues, y = total.images)) +
  geom_point(size = 3, colour = "dark blue")

#########################################################################################################
# Frequency of the image types in use: one bar plot per image-type column
# (columns 717 to 725), showing the share of sites for each observed value.
par(mfrow = c(1, 1))
k <- 717:725
# Iterate over the column indices directly, and divide by the actual number
# of sites rather than a hard-coded 408.
for (a in k) {
  image_type <- round(table(total_500_final[, a]) / nrow(total_500_final), 3)
  barplot(image_type,
    xlab = names(total_500_final)[a],
    ylab = "Shares of images per site", col = "dark green"
  )
}









# The most common image types are clearly .jpg, .gif and .png.
# Each image-type count is now plotted against Revenues.
ggplot(total_500_final, aes(x = Revenues, y = .bmp)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(x = Revenues, y = .dib)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(x = Revenues, y = .gif)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(x = Revenues, y = .jpe)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(x = Revenues, y = .jpeg)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(x = Revenues, y = .jpg)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(x = Revenues, y = .png)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(x = Revenues, y = .tif)) +
  geom_point(size = 3, colour = "dark blue")

ggplot(total_500_final, aes(x = Revenues, y = .tiff)) +
  geom_point(size = 3, colour = "dark blue")

# Correlations between Revenues (column 4) and the image-type counts plus
# total.images (columns 717 to 726)
total_500_im <- total_500_final[, c(4, 717:726)]
library(corrplot)
library(caret)
tl <- cor(total_500_im)
tl
## Revenues .bmp .dib .gif
## Revenues 1.000000000 0.083489281 0.0877047067 -0.020930575
## .bmp 0.083489281 1.000000000 -0.0013011275 -0.005780172
## .dib 0.087704707 -0.001301127 1.0000000000 0.196433219
## .gif -0.020930575 -0.005780172 0.1964332192 1.000000000
## .jpe 0.059660288 -0.003534780 0.9108504455 0.235836802
## .jpeg 0.059427022 -0.003482535 0.9108963639 0.236016392
## .jpg 0.004515870 0.013995542 0.0081915788 -0.006939606
## .png 0.022053238 -0.001301001 0.2204575537 0.164538688
## .tif -0.002466003 0.050383371 0.0628334455 0.031167772
## .tiff -0.030165817 -0.005659409 -0.0008571371 0.007252077
## total.images 0.051605339 0.018395189 0.7961253175 0.319131935
## .jpe .jpeg .jpg .png
## Revenues 0.059660288 0.059427022 0.004515870 0.022053238
## .bmp -0.003534780 -0.003482535 0.013995542 -0.001301001
## .dib 0.910850445 0.910896364 0.008191579 0.220457554
## .gif 0.235836802 0.236016392 -0.006939606 0.164538688
## .jpe 1.000000000 0.999991326 -0.008220505 0.231367993
## .jpeg 0.999991326 1.000000000 -0.008242305 0.231422560
## .jpg -0.008220505 -0.008242305 1.000000000 0.244033499
## .png 0.231367993 0.231422560 0.244033499 1.000000000
## .tif 0.007431086 0.007587510 0.375321187 0.259001712
## .tiff -0.005630343 -0.005548159 0.224429140 0.040635288
## total.images 0.855175392 0.855225420 0.413319367 0.529706228
## .tif .tiff total.images
## Revenues -0.002466003 -0.0301658169 0.05160534
## .bmp 0.050383371 -0.0056594093 0.01839519
## .dib 0.062833445 -0.0008571371 0.79612532
## .gif 0.031167772 0.0072520772 0.31913194
## .jpe 0.007431086 -0.0056303426 0.85517539
## .jpeg 0.007587510 -0.0055481589 0.85522542
## .jpg 0.375321187 0.2244291400 0.41331937
## .png 0.259001712 0.0406352880 0.52970623
## .tif 1.000000000 0.0222123897 0.34887215
## .tiff 0.022212390 1.0000000000 0.07827891
## total.images 0.348872154 0.0782789113 1.00000000
# Plot the image correlation matrix already stored in tl
corrplot(tl, method = "number")

# Image sizes in use.
# First find the size columns (24 to 716) in which a single value covers
# every site; such constant columns carry no information.
# The comparison uses the actual number of sites instead of a hard-coded 408,
# and the indices are collected in one pass instead of growing a vector.
n_sites <- nrow(total_500_final)
size_cols <- 24:716
is_constant <- vapply(
  size_cols,
  function(j) table(total_500_final[, j])[[1]] == n_sites,
  logical(1)
)
k <- size_cols[is_constant]
#####################
# Column 24 holds a single value, so it is not used
names(total_500_final)[24]
## [1] "X144x144"
total_500_final$X144x144 <- NULL
# Size columns (24 to 715) whose second table level occurs in fewer than half
# of the sites (presumably the second level is 1 = "site has this size" --
# confirm against the data).
# The threshold follows the actual number of sites instead of a hard-coded
# 204. A column with a single level is skipped instead of failing with
# "subscript out of bounds".
half_sites <- nrow(total_500_final) / 2
size_cols <- 24:715
second_level_n <- vapply(
  size_cols,
  function(j) {
    counts <- table(total_500_final[, j])
    if (length(counts) < 2) NA_integer_ else counts[[2]]
  },
  integer(1)
)
false_not_existing <- size_cols[
  !is.na(second_level_n) & second_level_n < half_sites
]
########################
# For every size that exists in fewer than half of the sites: plot the size
# flag against Revenues, and draw a bar plot of how many sites have it.
par(mfrow = c(3, 3))
# Iterate over the detected columns themselves; a hard-coded 1:416 produces
# NA indices (or skips columns) whenever the number of detected columns is
# different.
for (a in false_not_existing) {
  plot(total_500_final[, a], total_500_final$Revenues)
  image_size <- table(total_500_final[, a])
  barplot(image_size,
    xlab = names(total_500_final)[a],
    ylab = "Has or not the size", col = "dark green"
  )
}





























































































# Size columns (24 to 715) whose second table level occurs in more than half
# of the sites (presumably the second level is 1 = "site has this size" --
# confirm against the data). A column at exactly half is not selected, as
# the comparison is strict.
# The threshold follows the actual number of sites instead of a hard-coded
# 204. A column with a single level is skipped instead of failing with
# "subscript out of bounds".
half_sites <- nrow(total_500_final) / 2
size_cols <- 24:715
second_level_n <- vapply(
  size_cols,
  function(j) {
    counts <- table(total_500_final[, j])
    if (length(counts) < 2) NA_integer_ else counts[[2]]
  },
  integer(1)
)
true_existing <- size_cols[
  !is.na(second_level_n) & second_level_n > half_sites
]
# For every size that exists in more than half of the sites: plot the size
# flag against Revenues, and draw a bar plot of how many sites have it.
par(mfrow = c(3, 3))
# Iterate over the detected columns themselves; a hard-coded 1:276 produces
# NA indices (or skips columns) whenever the number of detected columns is
# different.
for (a in true_existing) {
  image_size <- table(total_500_final[, a])
  plot(total_500_final[, a], total_500_final$Revenues)
  barplot(image_size,
    xlab = names(total_500_final)[a],
    ylab = "Has or not the size", col = "dark green"
  )
}






























































# In the plots above, the first 24 sizes show some differentiation with
# respect to Revenues: most sites have those sizes, but the sites with high
# revenues do not.
par(mfrow = c(3, 3))
# Keep the first 24 of the sizes that exist in more than half of the sites
keep <- head(true_existing, 24)
keep
## [1] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [24] 47
# These are columns 24 to 47, the only sizes kept for the further analysis.
# The columns to drop are derived from `keep` rather than a literal 48:715,
# so the two cannot drift apart. The guard matters because indexing with an
# empty negative vector would select no columns at all.
drop_cols <- setdiff(24:715, keep)
if (length(drop_cols) > 0) {
  total_500_final <- total_500_final[, -drop_cols]
}
# Remove the other Fortune 500 variables, which would interfere with the
# model, together with The_page_opened. Revenues is the only Fortune
# variable kept, as the quantity to examine.
for (unused_col in c(
  "Market_Value", "Assets", "Ranking", "Total_SH_Equity", "The_page_opened"
)) {
  total_500_final[[unused_col]] <- NULL
}
# Split the data into a training set (p = 0.85 of the rows) and a test set
library(caret)
set.seed(20)
sampling_vector <- createDataPartition(
  total_500_final$Revenues,
  p = 0.85, list = FALSE
)
total_500_final_train <- total_500_final[sampling_vector, ]
total_500_final_test <- total_500_final[-sampling_vector, ]
# Regression models to see which website variables matter most; the response
# modelled here is Revenues.
# Intercept-only (null) linear model
model_null <- lm(Revenues ~ 1, data = total_500_final_train)
summary(model_null)
##
## Call:
## lm(formula = Revenues ~ 1, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.16 -15.24 -11.17 -1.43 211.43
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.288 1.703 13.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.77 on 347 degrees of freedom
#####################################################################################################
#LASSO and Logistic Regression models
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
# Full linear model: Revenues regressed on every remaining variable
full <- lm(Revenues ~ ., data = total_500_final_train)
summary(full)
##
## Call:
## lm(formula = Revenues ~ ., data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.199 -8.486 -3.461 2.223 65.721
##
## Coefficients: (14 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 438.837989 29.754450 14.749 < 2e-16 ***
## non.document.error -0.973835 1.984634 -0.491 0.623996
## number_of_errors 0.012911 0.010948 1.179 0.239167
## number_of_warning -0.020989 0.038979 -0.538 0.590653
## facebook -3.004443 2.410328 -1.246 0.213530
## instagram 2.605986 2.234726 1.166 0.244460
## linkedin -0.325461 2.191702 -0.148 0.882047
## pinterest 1.109936 3.221429 0.345 0.730669
## twitter 1.567012 2.451651 0.639 0.523189
## youtube 2.272481 2.072409 1.097 0.273696
## Flesh_Mesaure 0.001529 0.004208 0.363 0.716631
## Readability 1.613550 0.770141 2.095 0.036973 *
## Sentences -0.006214 0.010664 -0.583 0.560540
## Unique.words -0.005883 0.019639 -0.300 0.764725
## Words 0.002363 0.004897 0.482 0.629833
## external 0.004634 0.017992 0.258 0.796924
## internal 0.001545 0.007942 0.195 0.845906
## total.links NA NA NA NA
## X15x75 -20.966921 20.586365 -1.018 0.309244
## X8x15 -30.814241 20.765759 -1.484 0.138856
## X44x556 -17.975428 20.815629 -0.864 0.388503
## X1x1 NA NA NA NA
## X800x1200 -9.979089 18.025681 -0.554 0.580250
## autox100. -3.048954 18.426054 -0.165 0.868682
## X24pxx133px -15.566911 18.548403 -0.839 0.401973
## X21pxx173px NA NA NA NA
## X46x214 NA NA NA NA
## X49x49 NA NA NA NA
## X50x45 -6.206753 18.046674 -0.344 0.731134
## X400x300 -12.942588 18.222831 -0.710 0.478091
## X292pxx292px -6.790009 14.896169 -0.456 0.648838
## X200pxx200px NA NA NA NA
## X1279pxx984px NA NA NA NA
## X300pxx1500px NA NA NA NA
## X29x29 -7.177375 14.533766 -0.494 0.621769
## X115x223 -8.312448 18.150382 -0.458 0.647291
## X160x233 NA NA NA NA
## X300x993 NA NA NA NA
## X41x192 NA NA NA NA
## X28x221 NA NA NA NA
## X15x12 NA NA NA NA
## X60x60 -75.505052 14.921561 -5.060 7.2e-07 ***
## .bmp 2.398250 0.633335 3.787 0.000183 ***
## .dib 0.316324 1.107796 0.286 0.775419
## .gif -0.064718 0.066521 -0.973 0.331361
## .jpe -0.610764 4.148684 -0.147 0.883055
## .jpeg 0.640994 4.149986 0.154 0.877350
## .jpg 0.007073 0.024010 0.295 0.768499
## .png -0.011736 0.031687 -0.370 0.711358
## .tif -0.009165 0.042009 -0.218 0.827451
## .tiff -3.166227 5.182089 -0.611 0.541652
## total.images NA NA NA NA
## loading.time -5.750583 2.043199 -2.814 0.005199 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.37 on 309 degrees of freedom
## Multiple R-squared: 0.8179, Adjusted R-squared: 0.7955
## F-statistic: 36.52 on 38 and 309 DF, p-value: < 2.2e-16
# Design matrix of the full model without its intercept column
x <- model.matrix(full)[, -1]
dim(x)
## [1] 348 52
# LASSO path of Revenues on all predictors
lasso <- glmnet(x, total_500_final_train$Revenues)
par(mfrow = c(1, 1), no.readonly = TRUE)
plot(lasso, xvar = "lambda", label = TRUE)

# Cross-validated LASSO to choose lambda
lassob <- cv.glmnet(x, total_500_final_train$Revenues)
lassob$lambda.min
## [1] 1.497886
lassob$lambda.1se
## [1] 6.046992
plot(lassob)

# Coefficients at lambda.min
blasso <- coef(lassob, s = "lambda.min")
blasso
## 53 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 3.947708e+02
## non.document.error .
## number_of_errors .
## number_of_warning .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Flesh_Mesaure .
## Readability .
## Sentences .
## Unique.words .
## Words .
## external .
## internal .
## total.links .
## X15x75 -6.454344e+00
## X8x15 -2.581794e+01
## X44x556 -2.274898e+01
## X1x1 -1.817242e-10
## X800x1200 -4.024409e+00
## autox100. -2.731837e+00
## X24pxx133px -1.264747e+01
## X21pxx173px -8.311505e-14
## X46x214 -2.509133e-14
## X49x49 -1.515554e-01
## X50x45 -4.483822e+00
## X400x300 -1.735377e+01
## X292pxx292px -7.628580e+00
## X200pxx200px .
## X1279pxx984px .
## X300pxx1500px .
## X29x29 -5.198336e+00
## X115x223 -1.075696e+01
## X160x233 -1.736363e-14
## X300x993 -3.268447e-14
## X41x192 .
## X28x221 .
## X15x12 .
## X60x60 -6.896751e+01
## .bmp 1.353565e+00
## .dib .
## .gif .
## .jpe 9.091761e-03
## .jpeg .
## .jpg .
## .png .
## .tif .
## .tiff .
## total.images .
## loading.time -1.663331e+00
dim(blasso)
## [1] 53 1
# Scale the LASSO and OLS coefficients (intercept dropped) by the standard
# deviation of each predictor
zblasso <- blasso[-1] * apply(x, 2, sd)
zbolt <- coef(full)[-1] * apply(x, 2, sd)
azbolt <- abs(zbolt)
sum(azbolt)
## [1] NA
# The sum is NA because some OLS coefficients are NA, so those variables
# have to be removed. Printing the coefficients shows which ones are NA.
coef(full)
## (Intercept) non.document.error number_of_errors
## 438.837988834 -0.973834742 0.012911122
## number_of_warning facebook instagram
## -0.020988515 -3.004442960 2.605986169
## linkedin pinterest twitter
## -0.325461120 1.109935889 1.567011663
## youtube Flesh_Mesaure Readability
## 2.272481175 0.001528797 1.613549502
## Sentences Unique.words Words
## -0.006213607 -0.005882824 0.002362520
## external internal total.links
## 0.004633997 0.001544738 NA
## X15x75 X8x15 X44x556
## -20.966920757 -30.814240798 -17.975427612
## X1x1 X800x1200 autox100.
## NA -9.979088664 -3.048953972
## X24pxx133px X21pxx173px X46x214
## -15.566910594 NA NA
## X49x49 X50x45 X400x300
## NA -6.206752976 -12.942588163
## X292pxx292px X200pxx200px X1279pxx984px
## -6.790009197 NA NA
## X300pxx1500px X29x29 X115x223
## NA -7.177375043 -8.312447514
## X160x233 X300x993 X41x192
## NA NA NA
## X28x221 X15x12 X60x60
## NA NA -75.505051947
## .bmp .dib .gif
## 2.398249510 0.316324310 -0.064718207
## .jpe .jpeg .jpg
## -0.610763627 0.640994242 0.007073258
## .png .tif .tiff
## -0.011735917 -0.009164584 -3.166227158
## total.images loading.time
## NA -5.750583435
# Refit the model without the variables whose coefficient was NA in `full`
full_2 <- lm(
  Revenues ~ . - total.images - total.links - X1x1 - X21pxx173px -
    X46x214 - X49x49 - X200pxx200px - X1279pxx984px - X300pxx1500px -
    X160x233 - X300x993 - X41x192 - X28x221 - X15x12,
  data = total_500_final_train
)
summary(full_2)
##
## Call:
## lm(formula = Revenues ~ . - total.images - total.links - X1x1 -
## X21pxx173px - X46x214 - X49x49 - X200pxx200px - X1279pxx984px -
## X300pxx1500px - X160x233 - X300x993 - X41x192 - X28x221 -
## X15x12, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.199 -8.486 -3.461 2.223 65.721
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 438.837989 29.754450 14.749 < 2e-16 ***
## non.document.error -0.973835 1.984634 -0.491 0.623996
## number_of_errors 0.012911 0.010948 1.179 0.239167
## number_of_warning -0.020989 0.038979 -0.538 0.590653
## facebook -3.004443 2.410328 -1.246 0.213530
## instagram 2.605986 2.234726 1.166 0.244460
## linkedin -0.325461 2.191702 -0.148 0.882047
## pinterest 1.109936 3.221429 0.345 0.730669
## twitter 1.567012 2.451651 0.639 0.523189
## youtube 2.272481 2.072409 1.097 0.273696
## Flesh_Mesaure 0.001529 0.004208 0.363 0.716631
## Readability 1.613550 0.770141 2.095 0.036973 *
## Sentences -0.006214 0.010664 -0.583 0.560540
## Unique.words -0.005883 0.019639 -0.300 0.764725
## Words 0.002363 0.004897 0.482 0.629833
## external 0.004634 0.017992 0.258 0.796924
## internal 0.001545 0.007942 0.195 0.845906
## X15x75 -20.966921 20.586365 -1.018 0.309244
## X8x15 -30.814241 20.765759 -1.484 0.138856
## X44x556 -17.975428 20.815629 -0.864 0.388503
## X800x1200 -9.979089 18.025681 -0.554 0.580250
## autox100. -3.048954 18.426054 -0.165 0.868682
## X24pxx133px -15.566911 18.548403 -0.839 0.401973
## X50x45 -6.206753 18.046674 -0.344 0.731134
## X400x300 -12.942588 18.222831 -0.710 0.478091
## X292pxx292px -6.790009 14.896169 -0.456 0.648838
## X29x29 -7.177375 14.533766 -0.494 0.621769
## X115x223 -8.312448 18.150382 -0.458 0.647291
## X60x60 -75.505052 14.921561 -5.060 7.2e-07 ***
## .bmp 2.398250 0.633335 3.787 0.000183 ***
## .dib 0.316324 1.107796 0.286 0.775419
## .gif -0.064718 0.066521 -0.973 0.331361
## .jpe -0.610764 4.148684 -0.147 0.883055
## .jpeg 0.640994 4.149986 0.154 0.877350
## .jpg 0.007073 0.024010 0.295 0.768499
## .png -0.011736 0.031687 -0.370 0.711358
## .tif -0.009165 0.042009 -0.218 0.827451
## .tiff -3.166227 5.182089 -0.611 0.541652
## loading.time -5.750583 2.043199 -2.814 0.005199 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.37 on 309 degrees of freedom
## Multiple R-squared: 0.8179, Adjusted R-squared: 0.7955
## F-statistic: 36.52 on 38 and 309 DF, p-value: < 2.2e-16
x <- model.matrix(full_2) [,-c(18,22,28,26,27,34,32,33,41,37,38,39,40,52)]
dim(x)
## [1] 348 28
lasso <- glmnet (x, total_500_final_train$Revenues)
par(mfrow=c(1,1),no.readonly = TRUE)
plot(lasso, xvar='lambda', label=T)

# Cross-validate the lasso on the same design matrix to choose lambda
lassob <- cv.glmnet(x = x, y = total_500_final_train$Revenues)
# lambda.min as reported by cv.glmnet
lassob[["lambda.min"]]
## [1] 1.804209
# lambda.1se as reported by cv.glmnet
lassob[["lambda.1se"]]
## [1] 4.167957
# Cross-validation curve
plot(lassob)

# Coefficients at lambda.min
blasso <- coef(lassob, s = "lambda.min")
blasso
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 380.68399196
## (Intercept) .
## non.document.error .
## number_of_errors .
## number_of_warning .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Flesh_Mesaure .
## Readability .
## Sentences .
## Unique.words .
## Words .
## external .
## internal .
## X8x15 -27.76901174
## X44x556 -22.67638827
## X800x1200 -3.39121192
## X24pxx133px -14.48085601
## X50x45 -4.29553809
## X400x300 -27.12156885
## X60x60 -82.43258867
## .bmp 1.11787313
## .dib 0.04443869
## .jpg .
## .png .
dim(blasso)
## [1] 29 1
# Standardised lasso coefficients at lambda.min. Row 1 of blasso is the
# intercept added by glmnet; the remaining rows follow the columns of x, so
# dropping it lines the coefficients up with the column standard deviations.
stopifnot(length(blasso[-1]) == ncol(x))
zblasso <- blasso[-1] * apply(x, 2, sd)
# Standardised OLS coefficients of full_2 for the same columns. The
# coefficients are picked by column name: coef(full_2)[-1] is longer than
# ncol(x), so multiplying the two directly recycled the standard deviations
# against the wrong coefficients (and raised a length-mismatch warning).
stopifnot(all(colnames(x) %in% names(coef(full_2))))
zbolt <- coef(full_2)[colnames(x)] * apply(x, 2, sd)
azbolt <- abs(zbolt)
sum(azbolt)
# Value recorded before the name alignment (from mis-recycled vectors):
# 5970.87 -- re-run to refresh.
# Shrinkage ratio: L1 norm of the standardised lasso coefficients relative to
# the L1 norm of the standardised OLS coefficients.
s <- sum(abs(zblasso)) / sum(azbolt)
s
# Value recorded before the name alignment: 0.005288955 -- re-run to refresh.
# OLS refit on the predictors with a non-zero lasso coefficient at lambda.min
full_3 <- lm(
  Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px + X50x45 +
    X400x300 + X60x60 + .bmp + .dib,
  data = total_500_final_train
)
summary(full_3)
##
## Call:
## lm(formula = Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px +
## X50x45 + X400x300 + X60x60 + .bmp + .dib, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.892 -8.676 -4.842 1.878 66.817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 428.8923 20.2160 21.215 < 2e-16 ***
## X8x15 -41.0270 17.4942 -2.345 0.0196 *
## X44x556 -24.1340 20.2005 -1.195 0.2330
## X800x1200 -5.3723 16.4937 -0.326 0.7448
## X24pxx133px -15.7302 13.0394 -1.206 0.2285
## X50x45 -4.8865 17.4942 -0.279 0.7802
## X400x300 -29.5016 15.1628 -1.946 0.0525 .
## X60x60 -85.9726 5.1401 -16.726 < 2e-16 ***
## .bmp 2.5989 0.6183 4.203 3.37e-05 ***
## .dib 0.9183 0.3771 2.435 0.0154 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.28 on 338 degrees of freedom
## Multiple R-squared: 0.8032, Adjusted R-squared: 0.7979
## F-statistic: 153.2 on 9 and 338 DF, p-value: < 2.2e-16
# Keep adjusted R-squared and AIC of full_3 for the later model comparison
ad_r_sq_f3 <- summary(full_3)[["adj.r.squared"]]
aic_f3 <- AIC(full_3)
# First three diagnostic plots of the fitted model
plot(full_3, which = 1:3)
## Warning: not plotting observations with leverage one:
## 4



##############################################
# Coefficients at lambda.1se
blassob <- coef(lassob, s = "lambda.1se")
blassob
## 29 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 317.9194224
## (Intercept) .
## non.document.error .
## number_of_errors .
## number_of_warning .
## facebook .
## instagram .
## linkedin .
## pinterest .
## twitter .
## youtube .
## Flesh_Mesaure .
## Readability .
## Sentences .
## Unique.words .
## Words .
## external .
## internal .
## X8x15 -10.5491256
## X44x556 -20.5169976
## X800x1200 -0.9339037
## X24pxx133px -12.8259711
## X50x45 -3.5213411
## X400x300 -25.8836893
## X60x60 -76.2263582
## .bmp .
## .dib .
## .jpg .
## .png .
# Standardised lasso coefficients at lambda.1se. Row 1 of blassob is the
# intercept added by glmnet; the remaining rows follow the columns of x.
stopifnot(length(blassob[-1]) == ncol(x))
zblassob <- blassob[-1] * apply(x, 2, sd)
# Standardised OLS coefficients of full_2 for the same columns, picked by
# column name. coef(full_2)[-1] is longer than ncol(x), so multiplying the two
# directly recycled the standard deviations against the wrong coefficients
# (and raised a length-mismatch warning).
stopifnot(all(colnames(x) %in% names(coef(full_2))))
zboltb <- coef(full_2)[colnames(x)] * apply(x, 2, sd)
# Shrinkage ratio: L1 norm of the standardised lasso coefficients relative to
# the L1 norm of the standardised OLS coefficients.
s <- sum(abs(zblassob)) / sum(abs(zboltb))
s
# Value recorded before the name alignment: 0.004421009 -- re-run to refresh.
# At lambda.1se the lasso keeps seven predictors (the .bmp and .dib
# coefficients shrink to zero); refit them by OLS.
full_4 <- lm(
  Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px + X50x45 +
    X400x300 + X60x60,
  data = total_500_final_train
)
summary(full_4)
##
## Call:
## lm(formula = Revenues ~ 1 + X8x15 + X44x556 + X800x1200 + X24pxx133px +
## X50x45 + X400x300 + X60x60, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.940 -9.004 -5.191 1.943 66.430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 428.505 20.847 20.554 <2e-16 ***
## X8x15 -41.027 18.041 -2.274 0.0236 *
## X44x556 -24.134 20.831 -1.159 0.2475
## X800x1200 -5.372 17.009 -0.316 0.7523
## X24pxx133px -15.730 13.447 -1.170 0.2429
## X50x45 -4.887 18.041 -0.271 0.7867
## X400x300 -28.009 15.624 -1.793 0.0739 .
## X60x60 -87.077 5.271 -16.521 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.73 on 340 degrees of freedom
## Multiple R-squared: 0.7894, Adjusted R-squared: 0.7851
## F-statistic: 182.1 on 7 and 340 DF, p-value: < 2.2e-16
# Keep adjusted R-squared and AIC of full_4 for the later model comparison
ad_r_sq_f4 <- summary(full_4)[["adj.r.squared"]]
aic_f4 <- AIC(full_4)
# First three diagnostic plots of the fitted model
plot(full_4, which = 1:3)
## Warning: not plotting observations with leverage one:
## 4



###############################################
# Stepwise selection in both directions, starting from model_null, with
# model_null as the lower and full_2 as the upper bound of the search, to see
# how many variables are indeed important
model_a <- step(
  model_null,
  scope = list(lower = model_null, upper = full_2),
  direction = "both"
)
## Start: AIC=2408.24
## Revenues ~ 1
##
## Df Sum of Sq RSS AIC
## + X60x60 1 249797 100551 1975.8
## + X115x223 1 247381 102967 1984.1
## + X29x29 1 237600 112748 2015.7
## + X292pxx292px 1 220892 129456 2063.8
## + X400x300 1 207118 143231 2099.0
## + X50x45 1 195591 154758 2125.9
## + X24pxx133px 1 170701 179647 2177.8
## + autox100. 1 154606 195743 2207.7
## + X800x1200 1 121714 228634 2261.7
## + X44x556 1 105033 245316 2286.2
## + X8x15 1 80446 269903 2319.5
## + X15x75 1 44830 305518 2362.6
## + loading.time 1 4505 345844 2405.7
## + number_of_warning 1 2937 347411 2407.3
## + .bmp 1 2746 347602 2407.5
## + .dib 1 2728 347620 2407.5
## + non.document.error 1 2320 348028 2407.9
## <none> 350348 2408.2
## + number_of_errors 1 1473 348875 2408.8
## + .jpe 1 1398 348951 2408.8
## + .jpeg 1 1384 348964 2408.9
## + Words 1 1122 349226 2409.1
## + Unique.words 1 1019 349329 2409.2
## + pinterest 1 909 349439 2409.3
## + Sentences 1 824 349525 2409.4
## + youtube 1 648 349701 2409.6
## + instagram 1 448 349900 2409.8
## + .gif 1 245 350103 2410.0
## + .png 1 217 350132 2410.0
## + .tiff 1 211 350138 2410.0
## + Flesh_Mesaure 1 199 350150 2410.0
## + facebook 1 140 350209 2410.1
## + linkedin 1 134 350214 2410.1
## + twitter 1 126 350222 2410.1
## + internal 1 29 350320 2410.2
## + .jpg 1 24 350324 2410.2
## + .tif 1 9 350340 2410.2
## + external 1 1 350348 2410.2
## + Readability 1 1 350348 2410.2
##
## Step: AIC=1975.85
## Revenues ~ X60x60
##
## Df Sum of Sq RSS AIC
## + X44x556 1 18449 82102 1907.3
## + X24pxx133px 1 18204 82348 1908.3
## + autox100. 1 17756 82796 1910.2
## + X800x1200 1 17731 82821 1910.3
## + X50x45 1 17058 83494 1913.2
## + X400x300 1 16545 84007 1915.3
## + X8x15 1 16357 84194 1916.1
## + X292pxx292px 1 12453 88098 1931.8
## + X15x75 1 9903 90648 1941.8
## + X29x29 1 6348 94203 1955.2
## + .bmp 1 3599 96952 1965.2
## + loading.time 1 2728 97823 1968.3
## + X115x223 1 2630 97922 1968.6
## + .jpe 1 1382 99170 1973.0
## + .jpeg 1 1380 99171 1973.0
## + youtube 1 1026 99526 1974.3
## + non.document.error 1 813 99739 1975.0
## <none> 100551 1975.8
## + instagram 1 543 100009 1976.0
## + .dib 1 528 100023 1976.0
## + Readability 1 468 100083 1976.2
## + number_of_errors 1 274 100278 1976.9
## + linkedin 1 269 100283 1976.9
## + external 1 196 100356 1977.2
## + number_of_warning 1 97 100455 1977.5
## + twitter 1 50 100502 1977.7
## + .gif 1 19 100532 1977.8
## + Flesh_Mesaure 1 18 100534 1977.8
## + .tiff 1 16 100535 1977.8
## + Words 1 10 100541 1977.8
## + internal 1 7 100545 1977.8
## + Unique.words 1 3 100549 1977.8
## + .tif 1 3 100549 1977.8
## + pinterest 1 2 100549 1977.8
## + .png 1 2 100550 1977.8
## + facebook 1 1 100550 1977.8
## + .jpg 1 1 100551 1977.8
## + Sentences 1 0 100551 1977.8
## - X60x60 1 249797 350348 2408.2
##
## Step: AIC=1907.3
## Revenues ~ X60x60 + X44x556
##
## Df Sum of Sq RSS AIC
## + X400x300 1 6571 75531 1880.3
## + X50x45 1 6101 76002 1882.4
## + X292pxx292px 1 5235 76867 1886.4
## + X24pxx133px 1 5038 77064 1887.3
## + autox100. 1 3736 78366 1893.1
## + .bmp 1 3599 78503 1893.7
## + X29x29 1 2872 79230 1896.9
## + loading.time 1 2461 79641 1898.7
## + .jpe 1 1450 80653 1903.1
## + .jpeg 1 1446 80656 1903.1
## + X115x223 1 1352 80751 1903.5
## + X800x1200 1 1269 80833 1903.9
## + X8x15 1 1122 80980 1904.5
## + X15x75 1 947 81155 1905.3
## + youtube 1 836 81266 1905.7
## + .dib 1 778 81324 1906.0
## + non.document.error 1 535 81567 1907.0
## <none> 82102 1907.3
## + instagram 1 386 81716 1907.7
## + number_of_errors 1 383 81719 1907.7
## + Readability 1 304 81798 1908.0
## + external 1 267 81835 1908.2
## + twitter 1 235 81867 1908.3
## + number_of_warning 1 121 81981 1908.8
## + linkedin 1 104 81998 1908.9
## + pinterest 1 89 82013 1908.9
## + facebook 1 50 82052 1909.1
## + .png 1 48 82054 1909.1
## + .jpg 1 35 82067 1909.2
## + Flesh_Mesaure 1 19 82083 1909.2
## + .tiff 1 16 82086 1909.2
## + Words 1 15 82088 1909.2
## + .tif 1 14 82088 1909.2
## + internal 1 8 82094 1909.3
## + .gif 1 3 82099 1909.3
## + Sentences 1 1 82101 1909.3
## + Unique.words 1 0 82102 1909.3
## - X44x556 1 18449 100551 1975.8
## - X60x60 1 163213 245316 2286.2
##
## Step: AIC=1880.27
## Revenues ~ X60x60 + X44x556 + X400x300
##
## Df Sum of Sq RSS AIC
## + .bmp 1 3599 71932 1865.3
## + loading.time 1 1874 73657 1873.5
## + .jpe 1 1345 74186 1876.0
## + .jpeg 1 1344 74187 1876.0
## + .dib 1 1205 74326 1876.7
## + X8x15 1 1122 74409 1877.1
## + X15x75 1 947 74584 1877.9
## + X24pxx133px 1 600 74931 1879.5
## + youtube 1 538 74993 1879.8
## + autox100. 1 443 75088 1880.2
## <none> 75531 1880.3
## + X29x29 1 315 75216 1880.8
## + X115x223 1 290 75241 1880.9
## + instagram 1 271 75260 1881.0
## + Readability 1 268 75263 1881.0
## + X292pxx292px 1 262 75269 1881.1
## + non.document.error 1 249 75282 1881.1
## + X50x45 1 227 75304 1881.2
## + external 1 209 75322 1881.3
## + number_of_errors 1 205 75326 1881.3
## + X800x1200 1 169 75362 1881.5
## + pinterest 1 156 75375 1881.5
## + twitter 1 79 75452 1881.9
## + .jpg 1 76 75455 1881.9
## + .tif 1 28 75503 1882.2
## + Flesh_Mesaure 1 23 75508 1882.2
## + internal 1 18 75513 1882.2
## + .png 1 17 75514 1882.2
## + .tiff 1 16 75515 1882.2
## + .gif 1 9 75522 1882.2
## + Words 1 5 75526 1882.2
## + Sentences 1 4 75527 1882.2
## + number_of_warning 1 2 75529 1882.3
## + Unique.words 1 2 75529 1882.3
## + facebook 1 1 75530 1882.3
## + linkedin 1 0 75531 1882.3
## - X400x300 1 6571 82102 1907.3
## - X44x556 1 8476 84007 1915.3
## - X60x60 1 59224 134755 2079.7
##
## Step: AIC=1865.28
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp
##
## Df Sum of Sq RSS AIC
## + loading.time 1 1725 70207 1858.8
## + .jpe 1 1363 70569 1860.6
## + .jpeg 1 1362 70570 1860.6
## + .dib 1 1210 70722 1861.4
## + X8x15 1 1122 70810 1861.8
## + X15x75 1 947 70985 1862.7
## + X24pxx133px 1 600 71332 1864.4
## + autox100. 1 443 71489 1865.1
## <none> 71932 1865.3
## + youtube 1 398 71534 1865.3
## + X29x29 1 315 71617 1865.8
## + instagram 1 310 71622 1865.8
## + X115x223 1 290 71642 1865.9
## + X292pxx292px 1 262 71670 1866.0
## + X50x45 1 227 71705 1866.2
## + Readability 1 224 71708 1866.2
## + non.document.error 1 202 71730 1866.3
## + number_of_errors 1 192 71740 1866.3
## + pinterest 1 176 71756 1866.4
## + twitter 1 175 71757 1866.4
## + X800x1200 1 169 71763 1866.5
## + external 1 151 71781 1866.5
## + .jpg 1 63 71869 1867.0
## + Words 1 32 71900 1867.1
## + facebook 1 22 71910 1867.2
## + Flesh_Mesaure 1 20 71912 1867.2
## + .png 1 17 71915 1867.2
## + linkedin 1 15 71917 1867.2
## + .tiff 1 14 71918 1867.2
## + .gif 1 7 71925 1867.2
## + .tif 1 5 71927 1867.3
## + Unique.words 1 4 71928 1867.3
## + Sentences 1 3 71928 1867.3
## + internal 1 2 71929 1867.3
## + number_of_warning 1 1 71930 1867.3
## - .bmp 1 3599 75531 1880.3
## - X400x300 1 6571 78503 1893.7
## - X44x556 1 8476 80408 1902.0
## - X60x60 1 59518 131450 2073.1
##
## Step: AIC=1858.84
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time
##
## Df Sum of Sq RSS AIC
## + .jpeg 1 1425 68782 1853.7
## + .jpe 1 1425 68782 1853.7
## + .dib 1 1370 68837 1854.0
## + X8x15 1 951 69256 1856.1
## + X15x75 1 870 69337 1856.5
## + X24pxx133px 1 671 69535 1857.5
## + autox100. 1 499 69708 1858.3
## + youtube 1 451 69756 1858.6
## <none> 70207 1858.8
## + instagram 1 333 69874 1859.2
## + external 1 327 69880 1859.2
## + Readability 1 296 69910 1859.4
## + number_of_errors 1 292 69915 1859.4
## + twitter 1 261 69946 1859.5
## + X800x1200 1 258 69949 1859.6
## + X50x45 1 245 69962 1859.6
## + X29x29 1 230 69977 1859.7
## + X292pxx292px 1 194 70013 1859.9
## + Words 1 176 70031 1860.0
## + pinterest 1 150 70057 1860.1
## + non.document.error 1 149 70058 1860.1
## + .jpg 1 145 70062 1860.1
## + X115x223 1 137 70070 1860.2
## + Unique.words 1 114 70093 1860.3
## + internal 1 109 70098 1860.3
## + .png 1 91 70116 1860.4
## + linkedin 1 82 70125 1860.4
## + Sentences 1 60 70147 1860.5
## + .tif 1 29 70178 1860.7
## + facebook 1 22 70185 1860.7
## + .gif 1 11 70196 1860.8
## + number_of_warning 1 10 70197 1860.8
## + .tiff 1 5 70202 1860.8
## + Flesh_Mesaure 1 3 70204 1860.8
## - loading.time 1 1725 71932 1865.3
## - .bmp 1 3450 73657 1873.5
## - X400x300 1 6006 76213 1885.4
## - X44x556 1 8582 78789 1897.0
## - X60x60 1 60220 130427 2072.4
##
## Step: AIC=1853.7
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time +
## .jpeg
##
## Df Sum of Sq RSS AIC
## + X8x15 1 948 67834 1850.9
## + X15x75 1 868 67914 1851.3
## + X24pxx133px 1 635 68148 1852.5
## + autox100. 1 535 68247 1853.0
## + Readability 1 476 68306 1853.3
## <none> 68782 1853.7
## + youtube 1 377 68405 1853.8
## + X800x1200 1 271 68511 1854.3
## + number_of_errors 1 242 68540 1854.5
## + X50x45 1 234 68548 1854.5
## + X29x29 1 229 68553 1854.5
## + twitter 1 217 68565 1854.6
## + instagram 1 204 68578 1854.7
## + X292pxx292px 1 192 68590 1854.7
## + .gif 1 160 68622 1854.9
## + .jpg 1 156 68626 1854.9
## + Words 1 151 68631 1854.9
## + linkedin 1 138 68644 1855.0
## + X115x223 1 135 68647 1855.0
## + Unique.words 1 130 68652 1855.0
## + external 1 125 68658 1855.1
## + non.document.error 1 120 68662 1855.1
## + internal 1 55 68727 1855.4
## + Sentences 1 46 68736 1855.5
## + .dib 1 35 68747 1855.5
## + pinterest 1 34 68748 1855.5
## + .tif 1 26 68756 1855.6
## + number_of_warning 1 18 68764 1855.6
## + facebook 1 10 68772 1855.7
## + .tiff 1 4 68778 1855.7
## + Flesh_Mesaure 1 1 68781 1855.7
## + .jpe 1 0 68782 1855.7
## + .png 1 0 68782 1855.7
## - .jpeg 1 1425 70207 1858.8
## - loading.time 1 1788 70570 1860.6
## - .bmp 1 3465 72247 1868.8
## - X400x300 1 5897 74679 1880.3
## - X44x556 1 8671 77453 1893.0
## - X60x60 1 60403 129185 2071.1
##
## Step: AIC=1850.87
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time +
## .jpeg + X8x15
##
## Df Sum of Sq RSS AIC
## + X24pxx133px 1 631 67203 1849.6
## + Readability 1 571 67263 1849.9
## + autox100. 1 532 67302 1850.1
## + youtube 1 431 67403 1850.7
## <none> 67834 1850.9
## + twitter 1 310 67524 1851.3
## + instagram 1 309 67525 1851.3
## + number_of_errors 1 275 67559 1851.5
## + X800x1200 1 266 67568 1851.5
## + X15x75 1 265 67569 1851.5
## + X50x45 1 233 67601 1851.7
## + X29x29 1 233 67601 1851.7
## + X292pxx292px 1 196 67639 1851.9
## + Unique.words 1 173 67661 1852.0
## + Words 1 169 67665 1852.0
## + linkedin 1 166 67668 1852.0
## + .gif 1 162 67673 1852.0
## + .jpg 1 156 67678 1852.1
## + X115x223 1 141 67693 1852.2
## + non.document.error 1 122 67712 1852.2
## + external 1 119 67715 1852.3
## + Sentences 1 65 67770 1852.5
## + internal 1 51 67784 1852.6
## + facebook 1 36 67798 1852.7
## + pinterest 1 35 67799 1852.7
## + .dib 1 33 67801 1852.7
## + .tif 1 25 67809 1852.7
## + .tiff 1 4 67830 1852.8
## + Flesh_Mesaure 1 1 67833 1852.9
## + number_of_warning 1 0 67834 1852.9
## + .png 1 0 67834 1852.9
## + .jpe 1 0 67834 1852.9
## - X8x15 1 948 68782 1853.7
## - X44x556 1 1330 69164 1855.6
## - .jpeg 1 1422 69256 1856.1
## - loading.time 1 1613 69448 1857.0
## - .bmp 1 3472 71306 1866.2
## - X400x300 1 5922 73756 1878.0
## - X60x60 1 60364 128198 2070.4
##
## Step: AIC=1849.62
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time +
## .jpeg + X8x15 + X24pxx133px
##
## Df Sum of Sq RSS AIC
## + Readability 1 660 66543 1848.2
## + youtube 1 420 66783 1849.4
## <none> 67203 1849.6
## + instagram 1 324 66880 1849.9
## + number_of_errors 1 275 66928 1850.2
## + twitter 1 269 66934 1850.2
## + X15x75 1 265 66938 1850.2
## + X29x29 1 231 66972 1850.4
## + X292pxx292px 1 194 67009 1850.6
## + Unique.words 1 179 67024 1850.7
## + .gif 1 177 67026 1850.7
## + linkedin 1 160 67043 1850.8
## + Words 1 159 67044 1850.8
## - X24pxx133px 1 631 67834 1850.9
## + .jpg 1 142 67062 1850.9
## + X115x223 1 139 67065 1850.9
## + non.document.error 1 121 67082 1851.0
## + external 1 105 67098 1851.1
## + pinterest 1 65 67138 1851.3
## + X800x1200 1 64 67140 1851.3
## + Sentences 1 63 67140 1851.3
## + internal 1 50 67153 1851.4
## + .dib 1 49 67154 1851.4
## + autox100. 1 41 67163 1851.4
## + facebook 1 23 67180 1851.5
## + .tif 1 23 67181 1851.5
## - X44x556 1 761 67964 1851.5
## + X50x45 1 15 67188 1851.5
## + .tiff 1 4 67199 1851.6
## + .png 1 3 67200 1851.6
## + number_of_warning 1 1 67202 1851.6
## + Flesh_Mesaure 1 1 67202 1851.6
## + .jpe 1 0 67203 1851.6
## - X8x15 1 944 68148 1852.5
## - .jpeg 1 1385 68588 1854.7
## - loading.time 1 1680 68883 1856.2
## - X400x300 1 1817 69020 1856.9
## - .bmp 1 3469 70672 1865.1
## - X60x60 1 60378 127581 2070.7
##
## Step: AIC=1848.19
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time +
## .jpeg + X8x15 + X24pxx133px + Readability
##
## Df Sum of Sq RSS AIC
## + instagram 1 510 66033 1847.5
## + number_of_errors 1 385 66159 1848.2
## <none> 66543 1848.2
## + youtube 1 380 66163 1848.2
## + X29x29 1 270 66273 1848.8
## + X15x75 1 265 66278 1848.8
## + twitter 1 224 66319 1849.0
## + X292pxx292px 1 198 66345 1849.2
## + pinterest 1 178 66365 1849.2
## + non.document.error 1 160 66383 1849.3
## + X115x223 1 159 66384 1849.3
## + .jpg 1 158 66385 1849.4
## + .gif 1 141 66402 1849.5
## - X44x556 1 645 67189 1849.5
## - Readability 1 660 67203 1849.6
## + external 1 92 66451 1849.7
## + .dib 1 87 66456 1849.7
## + Words 1 85 66458 1849.7
## + X800x1200 1 61 66482 1849.9
## + linkedin 1 51 66492 1849.9
## - X24pxx133px 1 720 67263 1849.9
## + Unique.words 1 43 66501 1850.0
## + X50x45 1 27 66516 1850.0
## + Flesh_Mesaure 1 26 66518 1850.0
## + internal 1 21 66523 1850.1
## + facebook 1 17 66526 1850.1
## + Sentences 1 17 66526 1850.1
## + .tiff 1 15 66528 1850.1
## + .tif 1 14 66529 1850.1
## + autox100. 1 9 66534 1850.1
## + .jpe 1 5 66539 1850.2
## + .png 1 4 66539 1850.2
## + number_of_warning 1 1 66543 1850.2
## - X8x15 1 1047 67590 1851.6
## - .jpeg 1 1597 68140 1854.4
## - X400x300 1 1683 68227 1854.9
## - loading.time 1 1792 68335 1855.4
## - .bmp 1 3391 69934 1863.5
## - X60x60 1 60871 127414 2072.2
##
## Step: AIC=1847.51
## Revenues ~ X60x60 + X44x556 + X400x300 + .bmp + loading.time +
## .jpeg + X8x15 + X24pxx133px + Readability + instagram
##
## Df Sum of Sq RSS AIC
## <none> 66033 1847.5
## + number_of_errors 1 328 65705 1847.8
## + X29x29 1 324 65709 1847.8
## + X15x75 1 265 65768 1848.1
## - instagram 1 510 66543 1848.2
## + X115x223 1 245 65788 1848.2
## - X44x556 1 528 66561 1848.3
## + X292pxx292px 1 216 65817 1848.4
## + .gif 1 181 65852 1848.5
## + youtube 1 157 65877 1848.7
## + non.document.error 1 142 65891 1848.8
## + .jpg 1 81 65953 1849.1
## + .dib 1 76 65958 1849.1
## + X800x1200 1 74 65959 1849.1
## + Words 1 69 65964 1849.1
## + external 1 68 65965 1849.2
## + twitter 1 62 65971 1849.2
## + pinterest 1 38 65995 1849.3
## + Unique.words 1 36 65998 1849.3
## + autox100. 1 26 66008 1849.4
## + .tiff 1 25 66008 1849.4
## + Flesh_Mesaure 1 20 66013 1849.4
## + X50x45 1 18 66015 1849.4
## + facebook 1 18 66016 1849.4
## + .png 1 17 66016 1849.4
## + Sentences 1 17 66016 1849.4
## + internal 1 16 66017 1849.4
## + linkedin 1 12 66021 1849.4
## - X24pxx133px 1 754 66788 1849.5
## + .jpe 1 5 66028 1849.5
## + .tif 1 4 66030 1849.5
## + number_of_warning 1 3 66030 1849.5
## - Readability 1 846 66880 1849.9
## - X8x15 1 1208 67241 1851.8
## - .jpeg 1 1426 67459 1852.9
## - X400x300 1 1587 67620 1853.8
## - loading.time 1 1823 67856 1855.0
## - .bmp 1 3429 69462 1863.1
## - X60x60 1 61259 127292 2073.9
# Coefficient table and fit statistics of the stepwise-selected model
summary(model_a)
##
## Call:
## lm(formula = Revenues ~ X60x60 + X44x556 + X400x300 + .bmp +
## loading.time + .jpeg + X8x15 + X24pxx133px + Readability +
## instagram, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.386 -8.578 -3.882 2.357 66.268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 422.07586 20.00250 21.101 < 2e-16 ***
## X60x60 -88.85405 5.02526 -17.681 < 2e-16 ***
## X44x556 -25.93785 15.80312 -1.641 0.10167
## X400x300 -27.14161 9.53846 -2.845 0.00471 **
## .bmp 2.53711 0.60649 4.183 3.67e-05 ***
## loading.time -5.41135 1.77406 -3.050 0.00247 **
## .jpeg 0.03943 0.01462 2.698 0.00734 **
## X8x15 -43.04916 17.33589 -2.483 0.01351 *
## X24pxx133px -21.05205 10.72927 -1.962 0.05057 .
## Readability 1.25782 0.60521 2.078 0.03844 *
## instagram 2.95626 1.83217 1.614 0.10757
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14 on 337 degrees of freedom
## Multiple R-squared: 0.8115, Adjusted R-squared: 0.8059
## F-statistic: 145.1 on 10 and 337 DF, p-value: < 2.2e-16
# Keep adjusted R-squared and AIC of model_a for the model comparison
ad_r_sq_ma <- summary(model_a)[["adj.r.squared"]]
aic_ma <- AIC(model_a)
# Draw the first three diagnostic plots to help explain the regression model
plot(model_a, which = 1:3)
## Warning: not plotting observations with leverage one:
## 3



################
#We compare the adjusted R-squared and the AIC of the models we created to find the best one
ad_r_sq_f3
## [1] 0.797919
ad_r_sq_f4
## [1] 0.7850981
ad_r_sq_ma
## [1] 0.8059286
#The best adjusted R-squared is the one of model_a (the closer to 1 the better)
aic_f3
## [1] 2850.194
aic_f4
## [1] 2869.653
aic_ma
## [1] 2837.088
#model_a also has the best (lowest) AIC, so it is the best model on both criteria
#######################################################################################################
# Actual test-set revenues and the predictions of the three candidate models
Actual_Revenues <- total_500_final_test[["Revenues"]]
predictions_ma <- predict(model_a, newdata = total_500_final_test)
predictions_full3 <- predict(full_3, newdata = total_500_final_test)
predictions_full4 <- predict(full_4, newdata = total_500_final_test)

# 2 x 2 grid: actual values first, then one panel per model
par(mfrow = c(2, 2))
plot(Actual_Revenues, col = "blue")
plot(predictions_ma, col = "Red", main = "Model a")
plot(predictions_full3, col = "Red", main = "Full_3 model")
plot(predictions_full4, col = "Red", main = "Full_4 model")

#####################################
#From the plots above we can see that the actual Revenues rise more smoothly, except for the Revenues of the #1-ranked company, which are extremely high in relation to the other companies.
#The smoothest prediction is that of model a, which, as noted before, has the best adjusted R-squared and the best AIC value
# Back to a single-panel layout
par(mfrow = c(1, 1))
# Training-set columns used for the correlation plot, selected by position
# NOTE(review): presumably Revenues plus the model_a predictors -- confirm
# against names(total_500_final_train)
total_500_final_reg <- total_500_final_train[
  , c(1, 6, 12, 20, 21, 25, 30, 42, 43, 47, 53)
]
# Pairwise correlations, shown as numbers
corrplot(cor(total_500_final_reg), method = "number")

# X8x15 is very highly correlated with X44x556, and X24pxx133px with X400x300.
# Refit model_a without X8x15 and X24pxx133px (one variable from each
# correlated pair) to see whether the model improves.
full_5 <- lm(
  Revenues ~ 1 + X60x60 + X44x556 + X400x300 + .bmp + loading.time + .jpeg +
    Readability + instagram,
  data = total_500_final_train
)
summary(full_5)
##
## Call:
## lm(formula = Revenues ~ 1 + X60x60 + X44x556 + X400x300 + .bmp +
## loading.time + .jpeg + Readability + instagram, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.944 -8.620 -4.054 2.701 65.967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 394.88728 16.79045 23.519 < 2e-16 ***
## X60x60 -88.71740 5.08374 -17.451 < 2e-16 ***
## X44x556 -63.78453 9.77718 -6.524 2.50e-10 ***
## X400x300 -39.25886 7.36190 -5.333 1.77e-07 ***
## .bmp 2.53695 0.61357 4.135 4.48e-05 ***
## loading.time -5.54201 1.79029 -3.096 0.00213 **
## .jpeg 0.03997 0.01479 2.703 0.00722 **
## Readability 1.04666 0.60852 1.720 0.08634 .
## instagram 2.32848 1.84028 1.265 0.20664
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.16 on 339 degrees of freedom
## Multiple R-squared: 0.8059, Adjusted R-squared: 0.8014
## F-statistic: 176 on 8 and 339 DF, p-value: < 2.2e-16
# Keep adjusted R-squared and AIC of full_5 for the comparison with model_a
adj_r_square_full5 <- summary(full_5)[["adj.r.squared"]]
aic_full5 <- AIC(full_5)
# Draw the first three diagnostic plots to help explain the regression model
plot(full_5, which = 1:3)



# Compare model_a with full_5 on adjusted R-squared and AIC
ad_r_sq_ma
## [1] 0.8059286
adj_r_square_full5
## [1] 0.8013705
aic_ma
## [1] 2837.088
aic_full5
## [1] 2843.226
#Both the adjusted R-squared (lower) and the AIC (higher) of full_5 are slightly worse than those of model_a
#######################################################################################################
##################################################################################################
# Clustering
# Cluster the companies on the variables retained by the regression
# (k-means with 3 centres on the 11 columns of total_500_final_reg).
set.seed(220)
fortuneCluster <- kmeans(
  x = total_500_final_reg[, 1:11],
  centers = 3,
  iter.max = 100,
  nstart = 1
)
# Cluster sizes
cluster <- table(fortuneCluster$cluster)
# Treat the cluster label as categorical so ggplot uses a discrete colour scale
fortuneCluster$cluster <- factor(fortuneCluster$cluster)

# Revenues against each remaining variable, coloured by cluster
ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = loading.time, color = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = Readability, color = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = instagram, color = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = .bmp, color = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = .jpeg, color = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = X60x60, color = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = X44x556, color = fortuneCluster$cluster)
) +
  geom_point(size = 3)

ggplot(
  total_500_final_reg,
  aes(x = Revenues, y = X400x300, color = fortuneCluster$cluster)
) +
  geom_point(size = 3)

#From the clustering we can see that the variables do indeed divide the highest revenues from the lowest ones
# Print the model_a summary again for reference
summary(model_a)
##
## Call:
## lm(formula = Revenues ~ X60x60 + X44x556 + X400x300 + .bmp +
## loading.time + .jpeg + X8x15 + X24pxx133px + Readability +
## instagram, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.386 -8.578 -3.882 2.357 66.268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 422.07586 20.00250 21.101 < 2e-16 ***
## X60x60 -88.85405 5.02526 -17.681 < 2e-16 ***
## X44x556 -25.93785 15.80312 -1.641 0.10167
## X400x300 -27.14161 9.53846 -2.845 0.00471 **
## .bmp 2.53711 0.60649 4.183 3.67e-05 ***
## loading.time -5.41135 1.77406 -3.050 0.00247 **
## .jpeg 0.03943 0.01462 2.698 0.00734 **
## X8x15 -43.04916 17.33589 -2.483 0.01351 *
## X24pxx133px -21.05205 10.72927 -1.962 0.05057 .
## Readability 1.25782 0.60521 2.078 0.03844 *
## instagram 2.95626 1.83217 1.614 0.10757
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14 on 337 degrees of freedom
## Multiple R-squared: 0.8115, Adjusted R-squared: 0.8059
## F-statistic: 145.1 on 10 and 337 DF, p-value: < 2.2e-16
# In model_a the variable with by far the largest effect is X60x60 (according
# to the original analysis: whether or not the site has an image of size
# 60x60). Refit full_5 without X60x60, just to see how well the remaining
# variables explain the revenues.
full_6 <- lm(
  Revenues ~ 1 + X44x556 + X400x300 + .bmp + loading.time + .jpeg +
    Readability + instagram,
  data = total_500_final_train
)
summary(full_6)
##
## Call:
## lm(formula = Revenues ~ 1 + X44x556 + X400x300 + .bmp + loading.time +
## .jpeg + Readability + instagram, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.704 -10.830 -6.491 1.438 98.117
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 396.90230 23.09946 17.182 < 2e-16 ***
## X44x556 -63.96097 13.45126 -4.755 2.94e-06 ***
## X400x300 -126.11557 7.46301 -16.899 < 2e-16 ***
## .bmp 2.44616 0.84411 2.898 0.0040 **
## loading.time -4.19747 2.46076 -1.706 0.0890 .
## .jpeg 0.03687 0.02034 1.813 0.0707 .
## Readability 0.42768 0.83576 0.512 0.6092
## instagram 0.84978 2.52914 0.336 0.7371
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.48 on 340 degrees of freedom
## Multiple R-squared: 0.6316, Adjusted R-squared: 0.624
## F-statistic: 83.28 on 7 and 340 DF, p-value: < 2.2e-16
# Keep adjusted R-squared and AIC of full_6
adj_r_square_full6 <- summary(full_6)[["adj.r.squared"]]
aic_full6 <- AIC(full_6)
# Draw the first three diagnostic plots to help explain the regression model
plot(full_6, which = 1:3)



# Test-set predictions of model_a and full_6 next to the actual revenues
predictions_ma <- predict(model_a, newdata = total_500_final_test)
Actual_Revenues <- total_500_final_test[["Revenues"]]
predictions_full_6 <- predict(full_6, newdata = total_500_final_test)

# 2 x 2 grid: actual values, then one panel per model
par(mfrow = c(2, 2))
plot(Actual_Revenues, col = "blue")
plot(predictions_ma, col = "Red", main = "Model A")
plot(predictions_full_6, col = "Red", main = "Full_6 model")
#######################################################

#The predictions of the new model (full_6) are not as good as those of model_a. Having checked this option as well, we conclude that the most important factors are the ones in model_a
summary(model_a)
##
## Call:
## lm(formula = Revenues ~ X60x60 + X44x556 + X400x300 + .bmp +
## loading.time + .jpeg + X8x15 + X24pxx133px + Readability +
## instagram, data = total_500_final_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.386 -8.578 -3.882 2.357 66.268
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 422.07586 20.00250 21.101 < 2e-16 ***
## X60x60 -88.85405 5.02526 -17.681 < 2e-16 ***
## X44x556 -25.93785 15.80312 -1.641 0.10167
## X400x300 -27.14161 9.53846 -2.845 0.00471 **
## .bmp 2.53711 0.60649 4.183 3.67e-05 ***
## loading.time -5.41135 1.77406 -3.050 0.00247 **
## .jpeg 0.03943 0.01462 2.698 0.00734 **
## X8x15 -43.04916 17.33589 -2.483 0.01351 *
## X24pxx133px -21.05205 10.72927 -1.962 0.05057 .
## Readability 1.25782 0.60521 2.078 0.03844 *
## instagram 2.95626 1.83217 1.614 0.10757
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14 on 337 degrees of freedom
## Multiple R-squared: 0.8115, Adjusted R-squared: 0.8059
## F-statistic: 145.1 on 10 and 337 DF, p-value: < 2.2e-16